Introduction

The purpose of this notebook is to give data locations, data ingestion code, and code for rudimentary analysis and visualization of COVID-19 data provided by New York Times, [NYT1].

The following steps are taken:

Note that other, older repositories with COVID-19 data exist, like, [JH1, VK1].

Remark: The time series section is done for illustration purposes only. The forecasts there should not be taken seriously.

Preliminary defintions

From the help of tolower:

capwords <- function(s, strict = FALSE) {
    cap <- function(s) paste(toupper(substring(s, 1, 1)),
                  {s <- substring(s, 2); if(strict) tolower(s) else s},
                             sep = "", collapse = " " )
    sapply(strsplit(s,  split = " "), cap, USE.NAMES = !is.null(names(s)))
}

Import data

NYTimes USA states data

dfNYDataStates <- read.csv( "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv", stringsAsFactors = FALSE )
colnames(dfNYDataStates) <- capwords(colnames(dfNYDataStates))
head(dfNYDataStates)
summary(as.data.frame(unclass(dfNYDataStates)))
         Date                State           Fips           Cases            Deaths      
 2020-03-28:  55   Washington   :  77   Min.   : 1.00   Min.   :     1   Min.   :   0.0  
 2020-03-29:  55   Illinois     :  74   1st Qu.:17.00   1st Qu.:     7   1st Qu.:   0.0  
 2020-03-30:  55   California   :  73   Median :30.00   Median :    75   Median :   1.0  
 2020-03-31:  55   Arizona      :  72   Mean   :30.89   Mean   :  1467   Mean   :  34.1  
 2020-04-01:  55   Massachusetts:  66   3rd Qu.:46.00   3rd Qu.:   565   3rd Qu.:  10.0  
 2020-04-02:  55   Wisconsin    :  62   Max.   :78.00   Max.   :130703   Max.   :4758.0  
 (Other)   :1609   (Other)      :1515                                                    

NYTimes USA counties data

dfNYDataCounties <- read.csv( "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv", stringsAsFactors = FALSE )
colnames(dfNYDataCounties) <- capwords(colnames(dfNYDataCounties))
head(dfNYDataCounties)
summary(as.data.frame(unclass(dfNYDataCounties)))
         Date              County                 State            Fips           Cases              Deaths        
 2020-04-06: 2497   Washington:  503   Georgia       : 2243   Min.   : 1001   Min.   :    0.00   Min.   :   0.000  
 2020-04-05: 2444   Unknown   :  494   Texas         : 2097   1st Qu.:17099   1st Qu.:    2.00   1st Qu.:   0.000  
 2020-04-04: 2410   Jefferson :  375   Virginia      : 1475   Median :28115   Median :    4.00   Median :   0.000  
 2020-04-03: 2357   Franklin  :  337   California    : 1462   Mean   :29352   Mean   :   74.45   Mean   :   1.693  
 2020-04-02: 2291   Jackson   :  305   Indiana       : 1329   3rd Qu.:42103   3rd Qu.:   16.00   3rd Qu.:   0.000  
 2020-04-01: 2231   Montgomery:  300   North Carolina: 1311   Max.   :56043   Max.   :68776.00   Max.   :2738.000  
 (Other)   :23967   (Other)   :35883   (Other)       :28280   NA's   :549                                          

US county records

dfUSACountyData <- read.csv( "https://raw.githubusercontent.com/antononcube/SystemModeling/master/Data/dfUSACountyRecords.csv", stringsAsFactors = FALSE )
head(dfUSACountyData)
summary(as.data.frame(unclass(dfUSACountyData)))
         Country          State                   County          FIPS         Population            Lat             Lon         
 UnitedStates:3143   Texas   : 254   WashingtonCounty:  30   Min.   : 1001   Min.   :      89   Min.   :19.60   Min.   :-166.90  
                     Georgia : 159   JeffersonCounty :  25   1st Qu.:18178   1st Qu.:   10980   1st Qu.:34.70   1st Qu.: -98.23  
                     Virginia: 134   FranklinCounty  :  24   Median :29177   Median :   25690   Median :38.37   Median : -90.40  
                     Kentucky: 120   JacksonCounty   :  23   Mean   :30390   Mean   :  102248   Mean   :38.46   Mean   : -92.28  
                     Missouri: 115   LincolnCounty   :  23   3rd Qu.:45082   3rd Qu.:   67507   3rd Qu.:41.81   3rd Qu.: -83.43  
                     Kansas  : 105   MadisonCounty   :  19   Max.   :56045   Max.   :10170292   Max.   :69.30   Max.   : -67.63  
                     (Other) :2256   (Other)         :2999                                                                       

Merge data

dsNYDataCountiesExtended <- 
  dfNYDataCounties %>% 
  dplyr::inner_join( dfUSACountyData %>% dplyr::select_at( .vars = c("FIPS", "Lat", "Lon", "Population") ), by = c( "Fips" = "FIPS" ) )
dsNYDataCountiesExtended

Basic data analysis

ParetoPlotForColumns( dsNYDataCountiesExtended, c("Cases", "Deaths"), scales = "free" )

Geo-histogram

cf <- colorBin( palette = "Reds", domain = log10(dsNYDataCountiesExtended$Cases), bins = 10 )
m <- 
  leaflet( dsNYDataCountiesExtended[, c("Lat", "Lon", "Cases")] ) %>%
  addTiles() %>% 
  addCircleMarkers( ~Lon, ~Lat, radius = ~ log10(Cases), fillColor = ~ cf(log10(Cases)), color = ~ cf(log10(Cases)), fillOpacity = 0.8, stroke = FALSE, popup = ~Cases )
Some values were outside the color scale and will be treated as NASome values were outside the color scale and will be treated as NA
m

Heat-map plots

An alternative of the geo-visualization is to use a heat-map plot.

Cases

Make a heat-map plot by sorting the rows of the cross-tabulation matrix (that correspond to states):

matSDC <- xtabs( Cases ~ State + Date, dfNYDataStates, sparse = TRUE)
d3heatmap::d3heatmap( log10(matSDC+1), cellnote = as.matrix(matSDC), scale = "none", dendrogram = "row", colors = "Blues", theme = "dark")

Deaths

Cross-tabulate states with dates over deaths and plot:

matSDD <- xtabs( Deaths ~ State + Date, dfNYDataStates, sparse = TRUE)
d3heatmap::d3heatmap( log10(matSDD+1), cellnote = as.matrix(matSDD), scale = "none", dendrogram = "row", colors = "Blues", theme = "dark")

Time series analysis

TBD…

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKYXV0aG9yOiBBbnRvbiBBbnRvbm92CmRhdGU6IDIwMjAtMDMtMzAKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3IsIGVjaG89RkFMU0V9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShsZWFmbGV0KQpsaWJyYXJ5KGQzaGVhdG1hcCkKbGlicmFyeShQYXJldG9QcmluY2lwbGVBZGhlcmVuY2UpCmBgYAoKIyBJbnRyb2R1Y3Rpb24KClRoZSBwdXJwb3NlIG9mIHRoaXMgbm90ZWJvb2sgaXMgdG8gZ2l2ZSBkYXRhIGxvY2F0aW9ucywgZGF0YSBpbmdlc3Rpb24gY29kZSwgYW5kIGNvZGUgZm9yIHJ1ZGltZW50YXJ5IGFuYWx5c2lzIGFuZCB2aXN1YWxpemF0aW9uIG9mIENPVklELTE5IGRhdGEgcHJvdmlkZWQgYnkgTmV3IFlvcmsgVGltZXMsIFtOWVQxXS4gCgpUaGUgZm9sbG93aW5nIHN0ZXBzIGFyZSB0YWtlbjoKCi0gSW5nZXN0IGRhdGEKCiAgLSBUYWtlIENPVklELTE5IGRhdGEgZnJvbSBUaGUgTmV3IFlvcmsgVGltZXMsIGJhc2VkIG9uIHJlcG9ydHMgZnJvbSBzdGF0ZSBhbmQgbG9jYWwgaGVhbHRoIGFnZW5jaWVzLCBbTllUMV0uCgogIC0gVGFrZSBVU0EgY291bnRpZXMgcmVjb3JkcyBkYXRhIChGSVBTIGNvZGVzLCBnZW8tY29vcmRpbmF0ZXMsIHBvcHVsYXRpb25zKSwgW1dSSTFdLgoKLSBNZXJnZSB0aGUgZGF0YS4KCi0gTWFrZSBkYXRhIHN1bW1hcmllcyBhbmQgcmVsYXRlZCBwbG90cy4KCi0gTWFrZSBjb3JyZXNwb25kaW5nIGdlby1wbG90cy4KCk5vdGUgdGhhdCBvdGhlciwgb2xkZXIgcmVwb3NpdG9yaWVzIHdpdGggQ09WSUQtMTkgZGF0YSBleGlzdCwgbGlrZSwgW0pIMSwgVksxXS4KCipSZW1hcms6KiBUaGUgdGltZSBzZXJpZXMgc2VjdGlvbiBpcyBkb25lIGZvciBpbGx1c3RyYXRpb24gcHVycG9zZXMgb25seS4gVGhlIGZvcmVjYXN0cyB0aGVyZSBzaG91bGQgbm90IGJlIHRha2VuIHNlcmlvdXNseS4KCiMgUHJlbGltaW5hcnkgZGVmaW50aW9ucwoKRnJvbSB0aGUgaGVscCBvZiBgdG9sb3dlcmA6CgpgYGB7cn0KY2Fwd29yZHMgPC0gZnVuY3Rpb24ocywgc3RyaWN0ID0gRkFMU0UpIHsKICAgIGNhcCA8LSBmdW5jdGlvbihzKSBwYXN0ZSh0b3VwcGVyKHN1YnN0cmluZyhzLCAxLCAxKSksCiAgICAgICAgICAgICAgICAgIHtzIDwtIHN1YnN0cmluZyhzLCAyKTsgaWYoc3RyaWN0KSB0b2xvd2VyKHMpIGVsc2Ugc30sCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc2VwID0gIiIsIGNvbGxhcHNlID0gIiAiICkKICAgIHNhcHBseShzdHJzcGxpdChzLCAgc3BsaXQgPSAiICIpLCBjYXAsIFVTRS5OQU1FUyA9ICFpcy5udWxsKG5hbWVzKHMpKSkKfQpgYGAKCiMgSW1wb3J0IGRhdGEKCiMjIE5ZVGltZXMgVVNBIHN0YXRlcyBkYXRhCgpgYGB7cn0KZGZOWURhdGFTdGF0ZXMgPC0gcmVhZC5jc3YoICJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vbnl0aW1lcy9jb3ZpZC0xOS1kYXRhL21hc3Rlci91cy1zdGF0ZXMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFICkKY29sbmFtZXMoZGZOWURhdGFTdGF0ZXMpIDwtIGNhcHdvcmRzKGNvbG5hbWVzKGRmTllEYXRhU3RhdGVzKSkKaGVhZChkZk5ZRGF0YVN0YXRlcykKYGBgCgpgYGB7cn0Kc3VtbWFyeShhcy5kYXRhLmZyYW1lKHVuY2xhc3MoZGZOWURhdGFTdGF0ZXMpKSkKYGBgCgojIyBOWVRpbWVzIFVTQSBjb3VudGllcyBkYXRhCgpgYGB7cn0KZGZOWURhdGFDb3VudGllcyA8LSByZWFkLmNzdiggImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9ueXRpbWVzL2NvdmlkLTE5LWRhdGEvbWFzdGVyL3VzLWNvdW50aWVzLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSApCmNvbG5hbWVzKGRmTllEYXRhQ291bnRpZXMpIDwtIGNhcHdvcmRzKGNvbG5hbWVzKGRmTllEYXRhQ291bnRpZXMpKQpoZWFkKGRmTllEYXRhQ291bnRpZXMpCmBgYAoKYGBge3J9CnN1bW1hcnkoYXMuZGF0YS5mcmFtZSh1bmNsYXNzKGRmTllEYXRhQ291bnRpZXMpKSkKYGBgCgojIyBVUyBjb3VudHkgcmVjb3JkcwoKYGBge3J9CmRmVVNBQ291bnR5RGF0YSA8LSByZWFkLmNzdiggImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9hbnRvbm9uY3ViZS9TeXN0ZW1Nb2RlbGluZy9tYXN0ZXIvRGF0YS9kZlVTQUNvdW50eVJlY29yZHMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFICkKaGVhZChkZlVTQUNvdW50eURhdGEpCmBgYAoKYGBge3J9CnN1bW1hcnkoYXMuZGF0YS5mcmFtZSh1bmNsYXNzKGRmVVNBQ291bnR5RGF0YSkpKQpgYGAKCiMgTWVyZ2UgZGF0YQoKYGBge3J9CmRzTllEYXRhQ291bnRpZXNFeHRlbmRlZCA8LSAKICBkZk5ZRGF0YUNvdW50aWVzICU+JSAKICBkcGx5cjo6aW5uZXJfam9pbiggZGZVU0FDb3VudHlEYXRhICU+JSBkcGx5cjo6c2VsZWN0X2F0KCAudmFycyA9IGMoIkZJUFMiLCAiTGF0IiwgIkxvbiIsICJQb3B1bGF0aW9uIikgKSwgYnkgPSBjKCAiRmlwcyIgPSAiRklQUyIgKSApCmRzTllEYXRhQ291bnRpZXNFeHRlbmRlZApgYGAKCgojIEJhc2ljIGRhdGEgYW5hbHlzaXMKCmBgYHtyfQpQYXJldG9QbG90Rm9yQ29sdW1ucyggZHNOWURhdGFDb3VudGllc0V4dGVuZGVkLCBjKCJDYXNlcyIsICJEZWF0aHMiKSwgc2NhbGVzID0gImZyZWUiICkKYGBgCgojIEdlby1oaXN0b2dyYW0KCmBgYHtyfQpjZiA8LSBjb2xvckJpbiggcGFsZXR0ZSA9ICJSZWRzIiwgZG9tYWluID0gbG9nMTAoZHNOWURhdGFDb3VudGllc0V4dGVuZGVkJENhc2VzKSwgYmlucyA9IDEwICkKYGBgCgpgYGB7cn0KbSA8LSAKICBsZWFmbGV0KCBkc05ZRGF0YUNvdW50aWVzRXh0ZW5kZWRbLCBjKCJMYXQiLCAiTG9uIiwgIkNhc2VzIildICkgJT4lCiAgYWRkVGlsZXMoKSAlPiUgCiAgYWRkQ2lyY2xlTWFya2VycyggfkxvbiwgfkxhdCwgcmFkaXVzID0gfiBsb2cxMChDYXNlcyksIGZpbGxDb2xvciA9IH4gY2YobG9nMTAoQ2FzZXMpKSwgY29sb3IgPSB+IGNmKGxvZzEwKENhc2VzKSksIGZpbGxPcGFjaXR5ID0gMC44LCBzdHJva2UgPSBGQUxTRSwgcG9wdXAgPSB+Q2FzZXMgKQptCmBgYAoKIyBIZWF0LW1hcCBwbG90cwoKQW4gYWx0ZXJuYXRpdmUgb2YgdGhlIGdlby12aXN1YWxpemF0aW9uIGlzIHRvIHVzZSBhIGhlYXQtbWFwIHBsb3QuCgoKIyMgQ2FzZXMKCk1ha2UgYSBoZWF0LW1hcCBwbG90IGJ5IHNvcnRpbmcgdGhlIHJvd3Mgb2YgdGhlIGNyb3NzLXRhYnVsYXRpb24gbWF0cml4ICh0aGF0IGNvcnJlc3BvbmQgdG8gc3RhdGVzKToKCmBgYHtyfQptYXRTREMgPC0geHRhYnMoIENhc2VzIH4gU3RhdGUgKyBEYXRlLCBkZk5ZRGF0YVN0YXRlcywgc3BhcnNlID0gVFJVRSkKZDNoZWF0bWFwOjpkM2hlYXRtYXAoIGxvZzEwKG1hdFNEQysxKSwgY2VsbG5vdGUgPSBhcy5tYXRyaXgobWF0U0RDKSwgc2NhbGUgPSAibm9uZSIsIGRlbmRyb2dyYW0gPSAicm93IiwgY29sb3JzID0gIkJsdWVzIiwgdGhlbWUgPSAiZGFyayIpCmBgYAoKCkRlYXRocwoKQ3Jvc3MtdGFidWxhdGUgc3RhdGVzIHdpdGggZGF0ZXMgb3ZlciBkZWF0aHMgYW5kIHBsb3Q6CgoKYGBge3J9Cm1hdFNERCA8LSB4dGFicyggRGVhdGhzIH4gU3RhdGUgKyBEYXRlLCBkZk5ZRGF0YVN0YXRlcywgc3BhcnNlID0gVFJVRSkKZDNoZWF0bWFwOjpkM2hlYXRtYXAoIGxvZzEwKG1hdFNERCsxKSwgY2VsbG5vdGUgPSBhcy5tYXRyaXgobWF0U0REKSwgc2NhbGUgPSAibm9uZSIsIGRlbmRyb2dyYW0gPSAicm93IiwgY29sb3JzID0gIkJsdWVzIiwgdGhlbWUgPSAiZGFyayIpCmBgYAoKIyBUaW1lIHNlcmllcyBhbmFseXNpcwoKVEJELi4uCgoKCg==